GROUP TASK

Pablo Aísa Serranos, Irene Bosque Gala, Diego Fernández Álvarez, Mafalda González González, Sophie Kersten, Irantzu Lamarca Flores, David Pereiro Pol, Gür Piren

Data cleaning

Code
# Party names that keep their own label when none of the recoding patterns
# below matches them; every other list ends up in "OTHER".
important_parties <- c(
  "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
  "PARTIDO POPULAR",
  "CIUDADANOS",
  "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO",
  "BLOQUE NACIONALISTA GALEGO",
  "CONVERGÈNCIA I UNIÓ",
  "UNIDAS PODEMOS - IU",
  "ESQUERRA REPUBLICANA DE CATALUNYA",
  "EH - BILDU",
  "MÁS PAÍS",
  "VOX"
)

# Glue several name fragments into one "a|b|c" regular expression.
collapse_alternatives <- function(...) paste(c(...), collapse = "|")

# One pattern per party family; a list whose name contains any fragment is
# folded into that family.
psoe_pattern <- collapse_alternatives(
  "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
  "PARTIT DELS SOCIALISTES DE CATALUNYA",
  "PARTIDO SOCIALISTA DE EUSKADI",
  "PARTIDO DOS SOCIALISTAS DE GALICIA"
)
cs_pattern <- collapse_alternatives(
  "PARTIDO DE LA CIUDADANIA",
  "PARTIDO DE LA CIUDADANÍA"
)
bildu_pattern <- collapse_alternatives(
  "EH - BILDU", "ARALAR", "ALTERNATIBA", "EUSKO ALKARTASUNA"
)
podemos_pattern <- collapse_alternatives(
  "UNIDAS PODEMOS", "EN MAREA", "PODEM", "EZKER BATUA",
  "IZQUIERDA UNIDA", "ESQUERRA UNIDA", "ESQUERDA UNIDA"
)
ciu_pattern <- collapse_alternatives(
  "CONVERGÈNCIA I UNIÓ", "CONVERGENCIA I UNIO",
  "DEMOCRÀCIA I LLIBERTAT", "CONVERGÈNCIA i UNIÓ "
)
bng_pattern <- collapse_alternatives(
  "BLOQUE NACIONALISTA GALEGO", "CANDIDATURA GALEGA"
)
erc_pattern <- collapse_alternatives(
  "ESQUERRA REPUBLICANA DE CATALUNYA", "ESQUERRA REPUBLICANA/CATALUNYA"
)

# Abbreviation lookup: one row per party name, with three acronyms
# rewritten to the spelling used in the plots further down.
siglas_lookup <- abbrev |>
  distinct(denominacion, .keep_all = TRUE) |>
  mutate(
    siglas = recode(
      siglas,
      "C's" = "CS",
      "EH Bildu" = "EH BILDU",
      "M PAÍS" = "MP"
    )
  )

# Long format (one row per municipality, election and list), recoded party
# names, an election date, a municipality key and the party acronym.
election_data_tidy <- election_data |>
  pivot_longer(cols = -(1:15), names_to = "party", values_to = "votes") |>
  mutate(
    # The first matching rule wins, so the order of the rules matters.
    party_recoded = case_when(
      str_detect(party, psoe_pattern) ~ "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
      str_detect(party, cs_pattern) ~ "CIUDADANOS-PARTIDO DE LA CIUDADANIA",
      str_detect(party, bildu_pattern) ~ "EUSKAL HERRIA BILDU",
      str_detect(party, podemos_pattern) ~ "PODEMOS",
      str_detect(party, ciu_pattern) ~ "CONVERGENCIA I UNIO",
      str_detect(party, bng_pattern) ~ "BLOQUE NACIONALISTA GALEGO",
      str_detect(party, "PARTIDO POPULAR") ~ "PARTIDO POPULAR",
      str_detect(party, "MÁS PAÍS") ~ "MÁS PAÍS",
      str_detect(party, erc_pattern) ~ "ESQUERRA REPUBLICANA DE CATALUNYA",
      party %in% important_parties ~ party,
      TRUE ~ "OTHER"
    ),
    # Every election is dated to the first day of its month.
    date = as_date(glue("{anno}-{mes}-01"))
  ) |>
  unite(
    "cod_mun",
    codigo_ccaa, codigo_provincia, codigo_municipio,
    sep = "-",
    remove = FALSE
  ) |>
  left_join(siglas_lookup, by = c("party_recoded" = "denominacion")) |>
  left_join(cod_mun, by = "cod_mun") |>
  select(-c(vuelta, tipo_eleccion, codigo_distrito_electoral)) |>
  drop_na(votes) |>
  # Parties without an entry in the abbreviation table share the "OTHER" tag.
  mutate(siglas = replace_na(siglas, "OTHER"))

# Polls in long format (one row per poll and party), restricted to polls for
# elections from 2008 onwards that are not exit polls, have at least 750
# interviews and more than one day of fieldwork.
surveys_tidy <- surveys |>
  pivot_longer(
    cols = -(1:10),
    names_to = "party",
    values_to = "estimation"
  ) |>
  filter(year(date_elec) >= 2008) |>
  filter(exit_poll == FALSE) |>
  filter(size >= 750) |>
  filter(field_date_to - field_date_from >= 1) |>
  drop_na(size) |>
  select(-type_survey)

Question 1

Which party won in the municipalities with more than 100,000 inhabitants (census) in each of the elections?

Code
# Rows of municipalities whose census exceeds 100,000
large_municipalities <- filter(election_data_tidy, censo > 100000)

# Most-voted list per election and municipality; on a tie the first row wins
winners <- large_municipalities |>
  group_by(date, municipio) |>
  slice_max(order_by = votes, n = 1, with_ties = FALSE) |>
  select(date, party_recoded, municipio, censo)

# How many large municipalities each party won per election, largest first
winners_by_party <- winners |>
  ungroup() |>
  count(date, party_recoded, name = "num_municipalities") |>
  arrange(date, desc(num_municipalities))
Code
library(showtext)
font_add_google("Roboto Condensed", "Roboto")
# Election dates become discrete levels so the tiles form evenly spaced
# columns instead of being placed on a continuous time axis.
winners$date <- factor(winners$date, levels = unique(winners$date))

# Fill colour and short legend label for each winning party.
gmun_fill_values <- c(
  "PARTIDO POPULAR" = "#1db4e8",
  "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
  "OTHER" = "gray60",
  "PODEMOS" = "#a444b4",
  "VOX" = "#83b431",
  "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
  "CONVERGENCIA I UNIO" = "#1b348a"
)
gmun_fill_labels <- c(
  "PARTIDO POPULAR" = "PP",
  "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
  "OTHER" = "OTHER",
  "PODEMOS" = "PODEMOS",
  "VOX" = "VOX",
  "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
  "CONVERGENCIA I UNIO" = "CiU"
)

# Tile map: one row per municipality, one column per election, coloured by
# the winning party.
gmun <- ggplot(winners, aes(x = date, y = municipio, fill = party_recoded)) +
  geom_tile(color = "white") +
  scale_fill_manual(values = gmun_fill_values, labels = gmun_fill_labels) +
  labs(
    title = "Winning party in municipalities with more than 100,000 habitants",
    x = "Date of election",
    y = "Municipality",
    fill = "Parties"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 16, face = "bold", hjust = 0.5, family = "Roboto",
      margin = margin(b = 20)
    ),
    axis.text.x = element_text(size = 11, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` argument for border widths.
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )
Code
# National winner of each general election. `end_date` is the date of the
# following election, so each row describes one background band; the last
# band has zero width because no later election is listed.
general_election_winners <- data.frame(
  date = as.Date(c(
    "2008-03-01", "2011-11-01", "2015-12-01",
    "2016-06-01", "2019-04-01", "2019-11-01"
  )),
  party_recoded = c(
    "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
    "PARTIDO POPULAR",
    "PARTIDO POPULAR",
    "PARTIDO POPULAR",
    "PARTIDO SOCIALISTA OBRERO ESPAÑOL",
    "PARTIDO SOCIALISTA OBRERO ESPAÑOL"
  )
)

# End_date column
general_election_winners$end_date <- as.Date(c(
  "2011-11-01", "2015-12-01", "2016-06-01",
  "2019-04-01", "2019-11-01", "2019-11-01"
))

# Graph with the election winner colour as background
gwin <- ggplot(
  winners_by_party,
  aes(x = date, y = num_municipalities, color = party_recoded)
) +
  geom_rect(
    data = general_election_winners,
    aes(
      xmin = date, xmax = end_date, ymin = -Inf, ymax = Inf,
      fill = party_recoded
    ),
    alpha = 0.1, inherit.aes = FALSE
  ) +
  # `linewidth` replaces the deprecated `size` argument for line widths.
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  scale_color_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
      "OTHER" = "gray60",
      "PODEMOS" = "#a444b4",
      "VOX" = "#83b431",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
      "CONVERGENCIA I UNIO" = "#1b348a"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "Others",
      "PODEMOS" = "Podemos",
      "VOX" = "Vox",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "CONVERGENCIA I UNIO" = "CiU"
    )
  ) +
  scale_fill_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE"
    )
  ) +
  # The x axis is a Date axis, so the intercept is mapped as a Date; a
  # numeric value is expected to be rejected by the date transformation.
  geom_vline(
    data = general_election_winners,
    aes(xintercept = date),
    color = "gray50", linetype = "dashed", linewidth = 0.4
  ) +
  labs(
    title = "Evolution of winning party in municipalities with more than 100,000 habitants",
    x = "Date of Election",
    y = "Number of Municipalities",
    color = "Winner in each municipality",
    fill = "General Election Winner"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 16, face = "bold", family = "Roboto", margin = margin(b = 20)
    ),
    axis.text.x = element_text(size = 11, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Question 2

Which party came second when the PSOE came first? And when the PP came first?

Code
# Rank the lists inside every municipality and election, most voted first.
# Municipalities are identified by `cod_mun` rather than by name, so two
# municipalities that share a name are never ranked or joined together.
ranked_votes <- election_data_tidy |>
  group_by(date, cod_mun) |>
  arrange(desc(votes), .by_group = TRUE) |>
  mutate(rank = row_number()) |>
  ungroup()

# Municipality-elections won by `first_party`, each paired with the list
# ranked second there. `second` and `votes.y` are NA when no second list
# exists for that municipality and election.
second_place_behind <- function(ranked, first_party) {
  runners_up <- ranked |>
    filter(rank == 2) |>
    select(date, cod_mun, second = party_recoded, votes)

  ranked |>
    filter(rank == 1, party_recoded == first_party) |>
    left_join(runners_up, by = c("date", "cod_mun"))
}

# Number of municipalities per election and runner-up, largest first.
count_second_places <- function(pairs) {
  pairs |>
    group_by(date, second) |>
    summarize(num_municipalities = n(), .groups = "drop") |>
    arrange(date, desc(num_municipalities))
}

# PSOE is first
second_psoe <- second_place_behind(
  ranked_votes, "PARTIDO SOCIALISTA OBRERO ESPAÑOL"
)
second_psoe_sum <- count_second_places(second_psoe)

# PP is first
second_pp <- second_place_behind(ranked_votes, "PARTIDO POPULAR")
second_pp_sum <- count_second_places(second_pp)
Code
# Stack the PP-first and PSOE-first summaries, tagging each with the winner.
second_combined <- bind_rows(
  mutate(second_pp_sum, first = "PP"),
  mutate(second_psoe_sum, first = "PSOE")
)

# Election dates as discrete levels so every election gets its own bar.
second_combined$date <- factor(
  second_combined$date,
  levels = unique(second_combined$date)
)

# Stacked barplot with facets. `position = "fill"` rescales every bar to 1,
# so the y axis shows the share of municipalities, not their count.
gsec <- ggplot(
  second_combined,
  aes(x = date, y = num_municipalities, fill = second)
) +
  geom_col(position = "fill", color = "black") +
  scale_fill_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
      "OTHER" = "gray60",
      "PODEMOS" = "#a444b4",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
      "BLOQUE NACIONALISTA GALEGO" = "lightblue",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "orange",
      "VOX" = "#83b431",
      "CONVERGENCIA I UNIO" = "#1b348a",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "yellow"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "Others",
      "PODEMOS" = "Podemos",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "BLOQUE NACIONALISTA GALEGO" = "BNG",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "C's",
      "VOX" = "Vox",
      "CONVERGENCIA I UNIO" = "CiU",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
    )
  ) +
  labs(
    title = "Second places when PSOE or PP were first",
    x = "Election Date",
    y = "Share of Municipalities",
    fill = "Second Party"
  ) +
  facet_wrap(
    ~ first,
    scales = "free_y",
    labeller = labeller(first = c(PP = "PP First", PSOE = "PSOE First"))
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 14, face = "bold", family = "Roboto"),
    plot.title = element_text(
      size = 16, face = "bold", hjust = 0.5, family = "Roboto",
      margin = margin(b = 20)
    ),
    axis.text.x = element_text(
      size = 11, family = "Roboto", color = "black", angle = 20
    ),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` argument for border widths.
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

gsec
Code
library(ggalluvial)

# Replace the long runner-up names with short labels for the strata.
# `first` already holds "PP" / "PSOE", so only `second` is recoded. Rows
# without a runner-up (NA) and the PNV, BNG and ERC rows are left out of
# the flow chart.
second_combined <- second_combined |>
  mutate(
    second = recode(
      second,
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "Others",
      "PODEMOS" = "Podemos",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "BLOQUE NACIONALISTA GALEGO" = "BNG",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "C's",
      "CONVERGENCIA I UNIO" = "CiU",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
    )
  ) |>
  filter(!is.na(second), !second %in% c("PNV", "BNG", "ERC"))

# Alluvial chart: left axis is the winner, right axis the runner-up, and
# the width of each flow is the number of municipalities.
gsec2 <- ggplot(
  second_combined,
  aes(axis1 = first, axis2 = second, y = num_municipalities, fill = second)
) +
  geom_alluvium(width = 1 / 6) +
  geom_stratum(
    aes(fill = after_stat(stratum)),
    width = 1 / 6, color = "black"
  ) +
  geom_text(
    stat = "stratum",
    aes(label = after_stat(stratum)),
    size = 3.5, color = "black", fontface = "bold"
  ) +
  scale_fill_manual(
    values = c(
      "PP" = "#1db4e8",
      "PSOE" = "#c30505",
      "Others" = "gray60",
      "Podemos" = "#a444b4",
      "PNV" = "darkgreen",
      "BNG" = "lightblue",
      "C's" = "orange",
      "VOX" = "#83b431",
      "CiU" = "#1b348a",
      "ERC" = "yellow"
    )
  ) +
  labs(
    title = "Flow of municipalities won: First to second party",
    x = "First to Second Party",
    y = "Number of Municipalities",
    fill = "Second Party"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 17, face = "bold", hjust = 0.5, family = "Roboto",
      margin = margin(b = 20)
    ),
    axis.text.x = element_text(size = 12, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 12, family = "Roboto", color = "black"),
    axis.title.x = element_text(size = 13),
    axis.title.y = element_text(size = 13),
    legend.title = element_text(size = 13, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 11, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` argument for border widths.
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

gsec2

Question 3

Who benefits from low turnout?

Code
# Add turnout and vote-share columns:
#   total_votes          blank + null + candidature votes of the municipality
#   turnout              total_votes relative to the census
#   votes_recoded        votes of all lists sharing a recoded party, per
#                        municipality and election
#   vote_share_by_party  votes_recoded relative to total_votes
election_data_tidy <- election_data_tidy |>
  mutate(
    total_votes = votos_blancos + votos_nulos + votos_candidaturas,
    turnout = total_votes / censo
  ) |>
  group_by(cod_mun, date, party_recoded) |>
  mutate(votes_recoded = sum(votes, na.rm = TRUE)) |>
  ungroup() |>
  mutate(vote_share_by_party = votes_recoded / total_votes)

# Linear trend of vote share against turnout, one line per party.
# `vote_share_by_party` is repeated on every original list that was folded
# into the same recoded party, so the data are first reduced to one row per
# municipality, election and recoded party; otherwise a party would weigh
# more in the fit simply because it was built from more lists.
turnout_plot_data <- election_data_tidy |>
  distinct(cod_mun, date, party_recoded, .keep_all = TRUE)

ggplot(
  turnout_plot_data,
  aes(x = turnout, y = vote_share_by_party, colour = siglas)
) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Party-Specific Trends: Vote Share vs Turnout",
    x = "Turnout",
    y = "Vote Share",
    colour = "Party"
  ) +
  scale_colour_manual(values = c(
    "PP" = "#1db4e8",
    "PSOE" = "#c30505",
    "OTHER" = "gray60",
    "PODEMOS" = "#a444b4",
    "VOX" = "#83b431",
    "ERC" = "#ffbf41",
    "CIU" = "#1b348a",
    "MP" = "#004938",
    "CS" = "#eb6109",
    "EAJ-PNV" = "darkgreen",
    "BNG" = "lightblue",
    "EH BILDU" = "#03cfb4"
  )) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 17, face = "bold", hjust = 0.5, family = "Roboto",
      margin = margin(b = 20)
    ),
    axis.text.x = element_text(size = 12, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 12, family = "Roboto", color = "black"),
    axis.title.x = element_text(size = 13),
    axis.title.y = element_text(size = 13),
    legend.title = element_text(size = 13, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 11, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` argument for border widths.
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Question 4

How to analyze the relationship between census and vote?

[@gómezvalenzuela2023]

  • Rural: fewer than 10,000 citizens recorded in the census
  • Urban: 10,000 or more citizens recorded in the census
Code
# Linear trend of vote share against census size, one line per party.
# `vote_share_by_party` is repeated on every original list that was folded
# into the same recoded party, so the data are first reduced to one row per
# municipality, election and recoded party to avoid weighting the fit by
# the number of lists.
census_plot_data <- election_data_tidy |>
  distinct(cod_mun, date, party_recoded, .keep_all = TRUE)

ggplot(
  census_plot_data,
  aes(x = censo, y = vote_share_by_party, colour = siglas)
) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Party-Specific Trends: Vote Share vs Census",
    x = "Census",
    y = "Vote Share",
    colour = "Party"
  ) +
  scale_colour_manual(values = c(
    "PP" = "#1db4e8",
    "PSOE" = "#c30505",
    "OTHER" = "gray60",
    "PODEMOS" = "#a444b4",
    "VOX" = "#83b431",
    "ERC" = "#ffbf41",
    "CIU" = "#1b348a",
    "MP" = "#004938",
    "CS" = "#eb6109",
    "EAJ-PNV" = "darkgreen",
    "BNG" = "lightblue",
    "EH BILDU" = "#03cfb4"
  )) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 17, face = "bold", hjust = 0.5, family = "Roboto",
      margin = margin(b = 20)
    ),
    axis.text.x = element_text(size = 12, family = "Roboto", color = "black"),
    axis.text.y = element_text(size = 12, family = "Roboto", color = "black"),
    axis.title.x = element_text(size = 13),
    axis.title.y = element_text(size = 13),
    legend.title = element_text(size = 13, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 11, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` argument for border widths.
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Is it true that certain parties win in rural areas?

Code
# Rural vs urban split: a census below 10,000 is "Rural", anything else
# is "Urban".
election_data_tidy <- mutate(
  election_data_tidy,
  area_type = if_else(censo < 10000, "Rural", "Urban")
)
Code
# Most-voted recoded party per municipality and election. Municipalities
# are identified by `cod_mun` so that places sharing a name stay separate;
# on a tie the first row wins.
pick_local_winners <- function(data) {
  data |>
    group_by(date, cod_mun) |>
    slice_max(votes_recoded, n = 1, with_ties = FALSE) |>
    ungroup() |>
    select(date, party_recoded, cod_mun, municipio, censo)
}

# Number of municipalities won by each party per election, largest first.
count_wins_by_party <- function(local_winners) {
  local_winners |>
    group_by(date, party_recoded) |>
    summarize(num_municipalities = n(), .groups = "drop") |>
    arrange(date, desc(num_municipalities))
}

rural_municipalities <- election_data_tidy |>
  filter(area_type == "Rural")

# Winning parties
winners_rural <- pick_local_winners(rural_municipalities)

# Number of municipalities won. This is counted from `winners_rural`; the
# earlier version counted `winners`, i.e. the municipalities above 100,000
# inhabitants from Question 1.
winners_by_party_rural <- count_wins_by_party(winners_rural)

urban_municipalities <- election_data_tidy |>
  filter(area_type == "Urban")

# Winning parties
winners_urban <- pick_local_winners(urban_municipalities)

# Number of municipalities won
winners_by_party_urban <- count_wins_by_party(winners_urban)

# Both summaries stacked, tagged with the area type
type_combined <- bind_rows(
  mutate(winners_by_party_rural, type = "Rural"),
  mutate(winners_by_party_urban, type = "Urban")
)

# Election dates as discrete levels so every election gets its own bar.
type_combined$date <- factor(
  type_combined$date,
  levels = unique(type_combined$date)
)

# Stacked bars per election, one facet per area type. `position = "fill"`
# rescales every bar to 1, so the y axis shows the share of municipalities
# won by each party, not their count.
ggplot(
  type_combined,
  aes(x = date, y = num_municipalities, fill = party_recoded)
) +
  geom_col(position = "fill", color = "black") +
  scale_fill_manual(
    values = c(
      "PARTIDO POPULAR" = "#1db4e8",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "#c30505",
      "OTHER" = "gray60",
      "PODEMOS" = "#a444b4",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "darkgreen",
      "BLOQUE NACIONALISTA GALEGO" = "lightblue",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "orange",
      "VOX" = "#83b431",
      "CONVERGENCIA I UNIO" = "#1b348a",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "yellow"
    ),
    labels = c(
      "PARTIDO POPULAR" = "PP",
      "PARTIDO SOCIALISTA OBRERO ESPAÑOL" = "PSOE",
      "OTHER" = "Others",
      "PODEMOS" = "Podemos",
      "EUZKO ALDERDI JELTZALEA-PARTIDO NACIONALISTA VASCO" = "PNV",
      "BLOQUE NACIONALISTA GALEGO" = "BNG",
      "CIUDADANOS-PARTIDO DE LA CIUDADANIA" = "C's",
      "VOX" = "Vox",
      "CONVERGENCIA I UNIO" = "CiU",
      "ESQUERRA REPUBLICANA DE CATALUNYA" = "ERC"
    )
  ) +
  labs(
    title = "Wins depending on the type of the area",
    x = "Election Date",
    y = "Share of Municipalities",
    fill = "Parties"
  ) +
  facet_wrap(~ type, scales = "free_y") +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 14, face = "bold", family = "Roboto"),
    plot.title = element_text(
      size = 16, face = "bold", hjust = 0.5, family = "Roboto",
      margin = margin(b = 20)
    ),
    axis.text.x = element_text(
      size = 11, family = "Roboto", color = "black", angle = 20
    ),
    axis.text.y = element_text(size = 11, family = "Roboto", color = "black"),
    legend.title = element_text(size = 11, family = "Roboto", face = "bold"),
    legend.text = element_text(size = 10, family = "Roboto"),
    # `linewidth` replaces the deprecated `size` argument for border widths.
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

Question 5

How to calibrate the error of the polls (remember that the polls are voting intentions at national level)?

Code
# National participation per election: each municipality's total votes is
# taken once (the value is repeated on every party row) and summed by date.
elections_aggregated2 <- election_data_tidy |>
  distinct(date, cod_mun, total_votes) |>
  group_by(date) |>
  summarise(participation_total = sum(total_votes), .groups = "drop")

# National votes per election and party
elections_aggregated1 <- election_data_tidy |>
  group_by(date, party_recoded, siglas) |>
  summarise(total_votes_all = sum(votes, na.rm = TRUE), .groups = "drop")

# National vote share, in percent, of every party in every election
elections_with_shares <- left_join(
  elections_aggregated1,
  elections_aggregated2,
  by = "date"
) |>
  mutate(vote_share = (total_votes_all / participation_total) * 100)
Code
# Election results keyed by the first day of the election month
election_shares_by_month <- mutate(
  elections_with_shares,
  year_month = floor_date(date, "month")
)

# Attach the real result to every poll estimate (same election month, same
# party acronym); the error is the estimate minus the real vote share.
poll_calibration <- surveys_tidy |>
  mutate(year_month_elec = floor_date(date_elec, "month")) |>
  left_join(
    election_shares_by_month,
    by = c("year_month_elec" = "year_month", "party" = "siglas")
  ) |>
  mutate(error = estimation - vote_share)

select(poll_calibration, error)
# A tibble: 79,086 × 1
     error
     <dbl>
 1 NA     
 2  0.0124
 3 NA     
 4 NA     
 5  6.97  
 6 NA     
 7  4.81  
 8 NA     
 9  2.86  
10 NA     
# ℹ 79,076 more rows

Question 6

Which polling houses got it right the most and which ones deviated the most from the results?

Code
# Per-pollster summary: mean of the absolute errors and standard deviation
# of the signed errors, ignoring estimates without a matching result.
error_analysis <- poll_calibration |>
  group_by(pollster) |>
  summarise(
    mean_error = mean(abs(error), na.rm = TRUE),
    sd_error = sd(error, na.rm = TRUE),
    .groups = "drop"
  )
Code
# Absolute error of every poll estimate
poll_calibration <- mutate(poll_calibration, abs_error = abs(error))

# Mean and spread of the absolute error per pollster, most accurate first
pollster_accuracy <- poll_calibration |>
  group_by(pollster) |>
  summarise(
    mean_abs_error = mean(abs_error, na.rm = TRUE),
    sd_abs_error = sd(abs_error, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(mean_abs_error)

# Bar plot of the 10 pollsters with the highest mean absolute error
top_pollsters <- slice_max(pollster_accuracy, order_by = mean_abs_error, n = 10)

ggplot(
  top_pollsters,
  aes(x = reorder(pollster, mean_abs_error), y = mean_abs_error)
) +
  geom_col(fill = "purple", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Top 10 Pollster with Highest Mean Absolute Error",
    x = "Pollster",
    y = "Mean Absolute Error"
  ) +
  theme_minimal()
Code
# Bar plot of the 10 pollsters with the lowest mean absolute error
low_pollsters <- slice_min(pollster_accuracy, order_by = mean_abs_error, n = 10)

ggplot(
  low_pollsters,
  aes(x = reorder(pollster, mean_abs_error), y = mean_abs_error)
) +
  geom_col(fill = "purple", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Top 10 Pollster with Lowest Mean Absolute Error",
    x = "Pollster",
    y = "Mean Absolute Error"
  ) +
  theme_minimal()

Creative 1

How does polling error vary by party?

Code
# Mean and spread of the absolute polling error per party; only the ten
# parties with the smallest mean error are kept.
polling_error_by_party <- poll_calibration |>
  group_by(party) |>
  summarise(
    mean_abs_error = mean(abs_error, na.rm = TRUE),
    sd_abs_error = sd(abs_error, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(mean_abs_error) |>
  slice_head(n = 10)

polling_error_by_party
# A tibble: 10 × 3
   party   mean_abs_error sd_abs_error
   <chr>            <dbl>        <dbl>
 1 BNG              0.312        0.276
 2 EAJ-PNV          0.442        1.00 
 3 CIU              0.612        0.852
 4 ERC              0.995        1.99 
 5 VOX              2.04         1.78 
 6 PSOE             2.49         2.74 
 7 PP               2.53         1.88 
 8 MP               2.55         0.930
 9 CS               2.99         2.46 
10 PODEMOS          6.37         3.34 

Creative 2

By finding the most successful two parties for each year, calculate a polarisation index. Then, compare polarisation of vote of no confidence elections with the rest

Code
#' Polarisation index per general election
#'
#' The index is the combined vote of the two most-voted parties of an
#' election divided by the total number of votes cast (blank + null +
#' candidature votes) in that election.
#'
#' @param data Election data with one row per municipality, election and
#'   list. The columns `date`, `cod_mun`, `party_recoded`, `votes`,
#'   `votos_blancos`, `votos_nulos` and `votos_candidaturas` are used.
#' @param year One of 2008, 2011, 2015, 2016, 2019, or `"all"` to keep
#'   every election.
#' @return A data frame with `date`, `top_parties_votes`,
#'   `participation_total` and `polarization_index`; `NULL` (with a
#'   warning) when `year` is not an accepted value.
polarization_calc <- function(data, year) {
  valid_years <- c(2008, 2011, 2015, 2016, 2019)

  # A vector-valued `year` is rejected too instead of crashing the `if`.
  if (length(year) != 1 || !(year %in% c(valid_years, "all"))) {
    warning("Hey you! The year has to be one of these values: 2008, 2011, 2015, 2016, 2019, or 'all' (in quotes!) if you want to see the information for all years. Thanks :). Output:")
    return(NULL)
  }

  # Keep the requested year under a name that cannot be confused with a
  # column or with the `year()` date helper inside `filter()`.
  target_year <- year

  elections_processed <- data |>
    mutate(
      votos_candidaturas_complete =
        votos_blancos + votos_nulos + votos_candidaturas
    )

  # Total votes cast per election. The municipality total is repeated on
  # every list row, so it is collapsed before being summed over the country.
  elections_aggregated_total <- elections_processed |>
    group_by(date, cod_mun) |>
    distinct(votos_candidaturas_complete, .keep_all = TRUE) |>
    summarize(
      participation = sum(votos_candidaturas_complete),
      .groups = "drop"
    ) |>
    group_by(date) |>
    summarise(
      participation_total = sum(participation),
      .groups = "drop"
    )

  elections_aggregated_parties <- elections_processed |>
    group_by(date, party_recoded) |>
    summarize(
      total_votes_all = sum(votes, na.rm = TRUE),
      .groups = "drop"
    )

  # Two most-voted parties per election. "OTHER" is a bucket of many small
  # lists rather than a party, so it cannot take one of the two places, and
  # `with_ties = FALSE` guarantees that exactly two rows are added up.
  elections_top_parties <- elections_aggregated_parties |>
    filter(party_recoded != "OTHER") |>
    group_by(date) |>
    slice_max(total_votes_all, n = 2, with_ties = FALSE) |>
    summarise(top_parties_votes = sum(total_votes_all), .groups = "drop")

  polarization_index <- elections_top_parties |>
    left_join(elections_aggregated_total, by = "date") |>
    mutate(polarization_index = top_parties_votes / participation_total)

  if (target_year != "all") {
    polarization_index <- polarization_index |>
      filter(lubridate::year(date) == target_year)
  }

  polarization_index
}
Code
# Polarisation index of every election plus a "month-year" text label
polarization_all <- polarization_calc(election_data_tidy, "all") |>
  mutate(month_year = glue("{month(date)}-{year(date)}"))

# Election dates in chronological order, so the hand-written trend labels
# below line up with the period that follows each election.
unique_dates <- sort(unique(election_data_tidy$date))

# Label of the period starting at each election; the last election has no
# following period, hence "Nothing".
trend_after_election <- c(
  "Better Multiparty System",
  "Better Multiparty System",
  "Worse Multiparty System",
  "Better Multiparty System",
  "Worse Multiparty System",
  "Nothing"
)
stopifnot(length(trend_after_election) == length(unique_dates))

graph_info <- data.frame(
  start_date = unique_dates,
  end_date = c(unique_dates[-1], tail(unique_dates, 1)),
  trend = trend_after_election
)

# One axis tick per election year, placed on the first election of that
# year. The x axis is a Date axis, so the breaks must be dates: plain
# numbers such as 2008 fall outside the axis range and produce no ticks.
year_breaks <- unique_dates[!duplicated(year(unique_dates))]

graph <- ggplot(polarization_all, aes(x = date, y = polarization_index)) +
  # Background bands and election markers go first so that they sit behind
  # the line, the points and the labels.
  geom_rect(
    data = graph_info,
    aes(
      xmin = start_date, xmax = end_date, ymin = -Inf, ymax = Inf,
      fill = trend
    ),
    alpha = 0.3, inherit.aes = FALSE
  ) +
  geom_vline(
    data = graph_info,
    aes(xintercept = start_date),
    color = "gray50", linetype = "dashed", linewidth = 0.4
  ) +
  geom_line() +
  geom_point(size = 2, shape = 1) +
  # The label is mapped by column name; `data$column` inside `aes()`
  # bypasses the layer data.
  geom_text_repel(
    aes(label = month_year),
    box.padding = 0.5,
    segment.size = 0.1,
    nudge_y = -0.05
  ) +
  scale_x_date(
    breaks = year_breaks,
    date_labels = "%Y",
    # 2015 and 2016 are only six months apart; dodging keeps both readable.
    guide = guide_axis(n.dodge = 2)
  ) +
  # Only the two real trends get a colour; "Nothing" is left without one.
  scale_fill_manual(values = c(
    "Better Multiparty System" = "#31a354",
    "Worse Multiparty System" = "#de2d26"
  )) +
  labs(
    title = "Polarization Index trend in General Elections",
    x = "Date of Election",
    y = "Polarization Index",
    fill = "Polarization trend"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 16, face = "bold", hjust = 0.5, margin = margin(b = 20)
    ),
    axis.text.x = element_text(size = 11, color = "black"),
    axis.text.y = element_text(size = 11, color = "black"),
    axis.title.x = element_text(size = 10, face = "bold"),
    axis.title.y = element_text(size = 10, face = "bold"),
    legend.title = element_text(size = 11, face = "bold", hjust = 0.5),
    legend.text = element_text(size = 10),
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines"),
    panel.grid = element_blank(),
    legend.justification = "center",
    legend.position = "top",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.title.position = "top"
  )

graph

Creative 3

How has the vote in Catalonia changed over the years studied?

Code
# Important parties not included in the previous recodification
new_parties <- c(
  "JUNTS PER CATALUNYA-JUNTS",
  "CONVERGÈNCIA DEMOCRÀTICA DE CATALUNYA"
)

# Catalan municipalities only (codigo_ccaa "09"), one row per municipality,
# election and recoded party.
catalunya <- election_data_tidy |>
  filter(codigo_ccaa == "09") |>
  mutate(
    party_recoded = if_else(party %in% new_parties, party, party_recoded),
    siglas = case_when(
      party_recoded == "JUNTS PER CATALUNYA-JUNTS" ~ "CIU",
      party_recoded == "CONVERGÈNCIA DEMOCRÀTICA DE CATALUNYA" ~ "CIU", # included in CiU
      TRUE ~ siglas
    ),
    province = case_when(
      codigo_provincia == "08" ~ "Barcelona",
      codigo_provincia == "17" ~ "Girona",
      codigo_provincia == "25" ~ "Lleida",
      codigo_provincia == "43" ~ "Tarragona",
      TRUE ~ "Unknown"
    )
  ) |>
  # Several original lists can share one recoded party. Their votes are
  # added up before the rows are collapsed; keeping only the first row
  # would silently drop the votes of the other lists.
  group_by(date, cod_mun, party_recoded) |>
  mutate(votes = sum(votes, na.rm = TRUE)) |>
  ungroup() |>
  distinct(date, cod_mun, party_recoded, .keep_all = TRUE)

# Votes per election, province and party acronym, plus each party's
# percentage of the votes of the parties shown ("OTHER" is excluded).
catalunya_votes <- catalunya |>
  filter(party_recoded != "OTHER") |>
  group_by(date, province, siglas) |>
  summarize(total_votes = sum(votes, na.rm = TRUE), .groups = "drop") |>
  group_by(date, province) |>
  mutate(vote_percentage_parties = (total_votes / sum(total_votes)) * 100) |>
  ungroup()

# Line colour of each party acronym in the Catalan charts
catalan_party_colours <- c(
  "PP" = "#1db4e8",
  "PSOE" = "#c30505",
  "PODEMOS" = "#a444b4",
  "VOX" = "#83b431",
  "ERC" = "#ffbf41",
  "CIU" = "#1b348a",
  "MP" = "#004938",
  "CS" = "#eb6109"
)

# Vote percentage of every party over time, one panel per province
gcat <- ggplot(
  catalunya_votes,
  aes(
    x = date,
    y = vote_percentage_parties,
    colour = siglas,
    group = siglas
  )
) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  facet_wrap(~ province, ncol = 2) +
  scale_color_manual(values = catalan_party_colours) +
  # Values are already percentages, so they are only given a "%" suffix.
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "General elections in Catalonia over the years",
    x = "Date of the election",
    y = "Vote Share (%)",
    color = "Party",
    caption = "Source: Electoral Data"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(
      size = 16, face = "bold", hjust = 0.5, margin = margin(b = 20)
    ),
    strip.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(
      size = 8, color = "black", angle = 45, hjust = 1
    ),
    axis.text.y = element_text(size = 8, color = "black"),
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 8),
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    plot.margin = margin(15, 15, 15, 15),
    legend.key.size = unit(1, "lines")
  )

gcat

What happens if we group the parties into pro-independence and non-independence parties?

Code
# Acronyms counted as the pro-independence bloc (just ERC and CIU); every
# other acronym is counted as non-independence.
independentist_parties <- c("ERC", "CIU")

dataset <- catalunya_votes |>
  mutate(
    independentist = ifelse(
      siglas %in% independentist_parties,
      "Independentist", "Non-Independentist"
    )
  )

# Percentage of vote per bloc and province. The result is ungrouped so that
# later steps do not inherit the (date, province) grouping by accident.
votos_porcentaje_prov <- dataset |>
  group_by(date, independentist, province) |>
  summarise(total_votes = sum(total_votes), .groups = "drop") |>
  group_by(date, province) |>
  mutate(percentage = (total_votes / sum(total_votes)) * 100) |>
  ungroup()

# Graph comparing the blocs and the provinces
gcat2 <- ggplot(
  votos_porcentaje_prov,
  aes(x = date, y = percentage, color = independentist, group = independentist)
) +
  # `linewidth` replaces the deprecated `size` argument for line widths.
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_color_manual(
    values = c(
      "Independentist" = "darkblue",
      "Non-Independentist" = "darkred"
    ),
    labels = c(
      "Independentist" = "Pro-Independence",
      "Non-Independentist" = "Non-Independence"
    )
  ) +
  labs(
    title = "Evolution of Pro-Independence vs Non-Independence Vote",
    x = "Election Year",
    y = "Percentage of Votes (%)",
    color = "Political Bloc:",
    caption = "Source: Electoral Data"
  ) +
  facet_wrap(~ province, ncol = 2) +
  # Values are already percentages, so they are only given a "%" suffix.
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 10, face = "bold"),
    axis.text.x = element_text(
      size = 8, color = "black", angle = 45, hjust = 1
    ),
    axis.text.y = element_text(size = 8, color = "black"),
    legend.title = element_text(size = 10, face = "bold"),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    legend.position = "top",
    strip.text = element_text(size = 12, face = "bold"),
    plot.margin = margin(10, 10, 10, 10)
  )

gcat2

Creative 4